Identifying duplicate datasets in OBIS

This notebook tries to identify duplicate datasets in the OBIS system. This is done by aggregating datasets by geohash, species, and year, and calculating cosine similarities between datasets. Skip to results here.

grids

How to use this notebook

Delete the vectors directory to calculate vectors based on the latest data.

Dependencies

# Attach dependencies. Load order matters: data.table is attached after
# dplyr on purpose, so keep this sequence when adding packages.
library(tidyr)
library(glue)
library(Matrix)
library(dplyr)
library(arrow)
library(geohashTools)
library(data.table)
library(reactable)

Fetch occurrences from S3

This fetches just the necessary columns from the OBIS export on S3.

# Open an anonymous connection to the DigitalOcean Space that hosts the
# OBIS full exports.
space <- S3FileSystem$create(
  anonymous = TRUE,
  scheme = "https",
  endpoint_override = "ams3.digitaloceanspaces.com"
)

export <- space$path("obis-datasets/exports/obis_20220208.parquet")

# Pull only the columns needed downstream (dataset, coordinates, taxon,
# year) and materialize them as a tibble.
occ <- open_dataset(export) %>%
  select(dataset_id, decimalLongitude, decimalLatitude, AphiaID, year = date_year) %>%
  as_tibble()

Aggregate datasets by geohash, species, and year

# Aggregate occurrence records into (dataset, geohash x species x year)
# cells. Precision 2 geohashes (~1250 x 625 km) keep the cell space small.
stats <- occ %>%
  # Keep only rows with a year and coordinates valid for geohash encoding;
  # the original filter checked upper bounds only, letting lat < -90 or
  # lon < -180 (and leaving NA coordinates to be dropped implicitly).
  filter(
    !is.na(year),
    !is.na(decimalLatitude), !is.na(decimalLongitude),
    decimalLatitude >= -90, decimalLatitude < 90,
    decimalLongitude >= -180, decimalLongitude < 180
  ) %>%
  mutate(geohash = gh_encode(decimalLatitude, decimalLongitude, 2)) %>%
  # A factor so each distinct cell gets a stable integer index, used later
  # to position datasets in a shared vector space.
  mutate(cell = factor(paste(geohash, AphiaID, year, sep = "_"))) %>%
  group_by(dataset_id, cell) %>%
  summarize(records = n(), .groups = "drop")

Generate vectors and calculate similarity

First generate vectors by dataset and store as sparseVector.

# One vector per dataset over the shared cell space; the factor level index
# of each cell is its position in the vector, the value is the record count.
n_cells <- length(levels(stats$cell))
dataset_ids <- unique(stats$dataset_id)

vectors <- list()

for (id in dataset_ids) {
  message(id)
  dataset <- stats %>%
    filter(dataset_id == id)
  # Build the sparseVector directly from (cell index, count) pairs instead
  # of filling a dense length-n_cells vector element by element; this also
  # avoids the 1:nrow() footgun on empty subsets.
  vectors[[id]] <- sparseVector(
    x = dataset$records,
    i = as.integer(dataset$cell),
    length = n_cells
  )
}

Calculate similarities using parallelization and write the results to a text file.

# Compute the upper triangle of the pairwise cosine similarity matrix in
# parallel. Each worker returns its lines and the file is written once at
# the end: appending to a shared file from concurrent mclapply workers can
# interleave partial lines and corrupt the output.
n_datasets <- length(dataset_ids)

results <- parallel::mclapply(seq_len(n_datasets), function(i) {
  # The last dataset has no remaining partners; the original
  # (i + 1):length(...) produced c(n + 1, n) here and indexed out of bounds.
  if (i >= n_datasets) return(character(0))
  dataset_x <- dataset_ids[i]
  x <- as.vector(vectors[[dataset_x]])
  vapply((i + 1):n_datasets, function(j) {
    dataset_y <- dataset_ids[j]
    y <- as.vector(vectors[[dataset_y]])
    similarity <- coop::cosine(x, y)
    paste(dataset_x, dataset_y, format(similarity, scientific = FALSE))
  }, character(1))
}, mc.cores = 6)

writeLines(c("x y similarity", unlist(results)), "similarity.txt")

Plot the similarities to get an idea of the distribution.

# Read the pair similarities back from disk and eyeball their distribution.
similarity <- fread("similarity.txt", sep = " ", header = TRUE)
plot(similarity[["similarity"]])

Create a shortlist of suspect dataset pairs.

# Dataset metadata (landing page, title, record count) from the OBIS API;
# the per-dataset statistics list column is unnested to reach Occurrence.
datasets <- robis::dataset() %>%
  unnest(statistics) %>%
  select(id, url, title, records = Occurrence)

# Pairs above the similarity threshold, annotated with the metadata of
# both members (suffixed _x and _y), most similar first.
suspect <- similarity %>%
  filter(similarity > 0.85) %>%
  arrange(desc(similarity)) %>%
  left_join(datasets, by = c("x" = "id")) %>%
  left_join(datasets, by = c("y" = "id"), suffix = c("_x", "_y")) %>%
  as_tibble()

suspect 
## # A tibble: 426 × 9
##    x      y      similarity url_x   title_x  records_x url_y  title_y  records_y
##    <chr>  <chr>       <dbl> <chr>   <chr>        <int> <chr>  <chr>        <int>
##  1 01f35… bad2f…          1 http:/… Movemen…    143568 http:… Movemen…         2
##  2 08fb2… 40e33…          1 http:/… Nest ce…      5056 https… Nest ce…      5056
##  3 0a68b… 4b51b…          1 http:/… Microzo…        74 http:… Nationa…        74
##  4 0dd1c… 6bc62…          1 http:/… Leopard…       534 https… Leopard…       534
##  5 0e40e… 48f91…          1 https:… RMT Tra…       534 http:… RMT Tra…       547
##  6 11863… 5928a…          1 https:… Bunger …       140 http:… Bunger …       140
##  7 12abc… d7fa8…          1 http:/… Norfish…       271 http:… Norfish…       410
##  8 13347… 1e5d0…          1 http:/… Phytopl…      1098 http:… Joint O…      1098
##  9 15665… 31b19…          1 http:/… Microzo…        72 http:… Nationa…        72
## 10 15efe… 93176…          1 https:… IOMPMN1…       310 https… IOMPMN1…       310
## # … with 416 more rows

Results

# Column layout for the interactive table: HTML rendering enabled for the
# link and count columns, fixed widths for the narrow ones.
suspect_columns <- list(
  similarity = colDef(width = 100),
  title = colDef(html = TRUE),
  url = colDef(html = TRUE),
  records = colDef(html = TRUE, width = 100)
)

# Render each suspect pair as one row: stacked links to both OBIS dataset
# pages, both source URLs, and both record counts.
suspect %>%
  mutate(
    title = glue("<a href=\"https://obis.org/dataset/{x}\" target=\"_blank\">{title_x}</a><br/><br/><a href=\"https://obis.org/dataset/{y}\" target=\"_blank\">{title_y}</a>"),
    url = glue("<a href=\"{url_x}\" target=\"_blank\">{url_x}</a><br/><br/><a href=\"{url_y}\" target=\"_blank\">{url_y}</a>"),
    records = glue("{records_x}<br/><br/>{records_y}")
  ) %>%
  select(similarity, title, url, records) %>%
  reactable(columns = suspect_columns, pagination = FALSE)